Open In Colab

1. Two-dimensional data¶

In [ ]:
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
import pandas as pd
from typing import List
from numpy.linalg import norm
from IPython.core.formatters import Dict
from scipy.optimize import minimize
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons
import numpy as np
from matplotlib.patches import Ellipse
import pickle
In [ ]:
# Generate the 2D dataset using sklearn
# Create the moon datasets of varying sizes
n_samples_small = 100
n_samples_medium = 500
n_samples_large = 1000

#let's plot the dataset["medium"]
# X, _ = make_moons(n_samples=500, noise=0.05, random_state=0)
Xsmall,_=make_moons(n_samples=n_samples_small, noise=0.05, random_state=0)
Xmid,_=make_moons(n_samples=n_samples_medium, noise=0.05, random_state=0)
Xlarge,_=make_moons(n_samples=n_samples_large, noise=0.05, random_state=0)

datasets=datasets = {
    "Small": Xsmall,
    "Medium": Xmid,
    "Large": Xlarge,
}
# Plot any of the dataset
plt.figure(figsize=(8, 6))
plt.scatter(datasets["Medium"][:,0], datasets["Medium"][:,1], c='blue', label='make_moons dataset', alpha=0.5)
Out[ ]:
<matplotlib.collections.PathCollection at 0x7c741e2df6a0>

1.1 Histogram¶

In [ ]:
def train_histogram(X, bins=20):
    x_min, x_max = X[:, 0].min(), X[:, 0].max()
    y_min, y_max = X[:, 1].min(), X[:, 1].max()

    x_bins = np.linspace(x_min, x_max, bins + 1)
    y_bins = np.linspace(y_min, y_max, bins + 1)

    histogram = np.zeros((bins, bins), dtype=int)

    for i in range(bins):
        for j in range(bins):
            x_condition = np.logical_and(x_bins[i] <= X[:, 0], X[:, 0] < x_bins[i + 1])
            y_condition = np.logical_and(y_bins[j] <= X[:, 1], X[:, 1] < y_bins[j + 1])
            bin_count = np.sum(np.logical_and(x_condition, y_condition))
            histogram[i, j] = bin_count

    return histogram, x_bins, y_bins

# Train histogram for a specific dataset (e.g., the small one)
X_small = datasets["Large"]
histogram_small, x_bins, y_bins = train_histogram(X_small)

# You can visualize the histogram if needed
plt.imshow(histogram_small, origin='lower', cmap='Blues')
plt.colorbar()
plt.title('2D Histogram')
plt.show()
In [ ]:
def generate_samples_from_histogram(histogram, x_bins, y_bins, num_samples):
    # Normalize the histogram to obtain a probability distribution
    histogram_normalized = histogram / np.sum(histogram)

    # Get the shape of the histogram
    num_x_bins, num_y_bins = histogram.shape

    # Generate random indices based on the normalized histogram
    random_indices = np.unravel_index(
        np.random.choice(histogram.size, size=num_samples, p=histogram_normalized.ravel()),
        (num_x_bins, num_y_bins)
    )

    # Map the random indices to bin centers
    x_min, x_max = x_bins[0], x_bins[-1]
    y_min, y_max = y_bins[0], y_bins[-1]
    x_bin_centers = (x_bins[:-1] + x_bins[1:]) / 2
    y_bin_centers = (y_bins[:-1] + y_bins[1:]) / 2
    x_samples = x_bin_centers[random_indices[0]]
    y_samples = y_bin_centers[random_indices[1]]

    return np.column_stack((x_samples, y_samples))

# Example of generating data samples from the small histogram
num_samples_to_generate = 1000
generated_samples = generate_samples_from_histogram(histogram_small, x_bins, y_bins, num_samples_to_generate)

plt.scatter(generated_samples[:, 0], generated_samples[:, 1], c='red', marker='x', label='Generated Samples')

# You can also overlay the original data points for comparison
plt.scatter(X_small[:, 0], X_small[:, 1], c='blue', marker='o', label='Original Data')

# Set labels and legend
plt.xlabel('X-axis')
plt.ylabel('Y-axis')
plt.legend()

# Show the plot
plt.show()

1.2 Single Gaussian Distribution model¶

In [ ]:
### define n- dimensional gaussian distribution
def gaussian_multivar(x:np.array,mean:np.array,cov_mat:np.array):
  '''
  This model will evaluate the probabilty at the given x_vector using the GMM model.
  Args:
  1. x= np array of shape (1 X dimn_of_the_problem)
  2. mean= np array of shape (1 X dimn_of_the_problem)
  3.cov_mat= square np array of shape (dimn_of_the_prob)
  '''
  dimn=np.shape(x)[0]
  inv_cov=np.linalg.inv(cov_mat)
  x_minus_mu=x-mean
  # argument for the exp
  arg_for_expn=0.5*(x_minus_mu @ inv_cov @ np.transpose(x_minus_mu))[0,0]
  det_cov=np.linalg.det(cov_mat)
  # evaluating the prob
  prob=np.exp(-arg_for_expn) * (1./(np.sqrt(det_cov*((2*np.pi)**dimn))))

  return prob
In [ ]:
import numpy as np
from scipy.linalg import cholesky
from scipy.special import erfinv

def sample_multivariate_gaussian(mean, covariance, num_samples=1):
    """
    Sample from an n-dimensional Gaussian distribution.

    Args:
        mean: Mean vector (1D array) of length n. (1,)
        covariance: Covariance matrix (2D array) of shape (n, n).
        num_samples: Number of samples to generate (default is 1).

    Returns:
        samples: Array of shape (num_samples, n) containing generated samples.
    """
    n = len(mean)

    # Generate random vectors 'u' from a uniform distribution
    u = np.random.rand(num_samples, n)

    # Use the inverse error function (probit function) to transform 'u' into standard normal samples
    z_standard_normal = np.sqrt(2) * erfinv(2 * u - 1)

    # Apply the Cholesky decomposition to the covariance matrix
    L = cholesky(covariance, lower=True)

    # Transform standard normal samples into multivariate Gaussian samples
    samples = np.dot(z_standard_normal, L.T) + mean

    return samples

# Example usage:
# mean_vector = np.array([0.0, 0.0])
# covariance_matrix = np.array([[1.0, 0.5], [0.5, 1.0]])
# generated_samples = sample_multivariate_gaussian(mean_vector, covariance_matrix, num_samples=100)

Fitting the single (2-D) gaussina distribution to our data.¶

in our case, the best fitting parameter would be the mean vector and the covariance matrix of the given dataset

In [ ]:
def fit_gaussian_to_data(X):
    # Calculate the mean and covariance matrix
    mean = np.mean(X, axis=0)
    cov_matrix = np.cov(X, rowvar=False)

    return (mean, cov_matrix)
In [ ]:
fitted_params={}
fitted_params['Small']=fit_gaussian_to_data(datasets["Small"])
fitted_params['Medium']=fit_gaussian_to_data(datasets["Medium"])
fitted_params['Large']=fit_gaussian_to_data(datasets["Large"])
In [ ]:
sample_small=sample_multivariate_gaussian(fitted_params['Small'][0], fitted_params['Small'][1],num_samples=100)
sample_medium=sample_multivariate_gaussian(fitted_params['Small'][0], fitted_params['Small'][1],num_samples=100)
sample_high=sample_multivariate_gaussian(fitted_params['Small'][0], fitted_params['Small'][1],num_samples=100)
In [ ]:
# Plot
plt.figure(figsize=(8, 6))
plt.scatter(datasets['Small'][:,0], datasets['Small'][:, 1], c='blue', label='make_moons dataset: small', alpha=0.5)
plt.scatter(sample_small[:,0],sample_small[:,1],c='red',label="sampled",alpha=0.5)
plt.title("gaussian fitted to 100 datapoints")
plt.legend()
plt.show()

plt.figure(figsize=(8, 6))
plt.scatter(datasets['Medium'][:,0], datasets['Medium'][:, 1], c='blue', label='make_moons dataset: medium', alpha=0.5)
plt.scatter(sample_medium[:,0],sample_medium[:,1],c='red',label="sampled",alpha=0.5)
plt.title("gaussian fitted to 500 datapoints")
plt.legend()
plt.show()

plt.figure(figsize=(8, 6))
plt.scatter(datasets['Large'][:,0], datasets['Large'][:, 1], c='blue', label='make_moons dataset: large', alpha=0.5)
plt.scatter(sample_high[:,0],sample_high[:,1],c='red',label="sampled",alpha=0.5)
plt.title("gaussian fitted to 1000 datapoints")
plt.legend()
plt.show()

1.3 Gaussian mixture model (GMM)¶

In [ ]:
def gaussian_mixture_model(x_input:np.array, mixing_coeffs:np.array, mean_array:np.array, covs:Dict):
  '''
  Args:
  1. x_input: x vector at which one wants to evaluate the GMM model.
              A numpy array (row vector) of shape (1 X dimn of the problem)
  2. mean:  A numpy arrray of shape (L X dimn of the problem instance)
  3. mixing coeffs: a numpy array of shape (1 X L)
  4. covs: dict of covaraince matrices, L number of entries, one for each of the P_l(x).
  '''
  L=np.shape(mean_array)[0]
  dimn_of_the_prob=np.shape(x_input)[1]
  list_gmm=[mixing_coeffs[0,idx_el] * gaussian_multivar(x=x_input, mean=mean_array[idx_el,:],cov_mat=covs[idx_el])
            for idx_el in range(0,L)
            ]
  prob=sum(list_gmm)## sometimes instead of 1./num_data , literature uses the factor 1./(bandwidth X num_data)
  return prob

EM algorithm for training the params for the GMM¶

In [ ]:
def EM_training(mean:np.array,mixing_coeffs:np.array,covs:Dict, dataset_input:np.array,num_iters:int=500,tol:float=0.0001):
  '''
  Args:
  mean= a numpy array
        shape(mean)= (L X dimension_of_the_problem_instance (whether 1D, 2D, 3D and so on))
  mixing_coeffs=
        shape(mixing_coeffs)= (1 X L)
  covs= dict of covariance matrices.
        shape(covs)= L number of entries, one for each of the P_l(x)
  dataset_input: shape would be (num of data set X dimn_of_the_prob)
  '''
  L=np.shape(mean)[0]
  print("L is:");print(L)
  dimn_of_prob=np.shape(dataset_input)[1]
  # define gamma: responsibility
  num_data=np.shape(dataset_input)[0] ### note shape(dataset_input)=(num data X dimn_of_the_problem)
  gamma=np.zeros(shape=(num_data,L))
  prev_log_likelihood = -np.inf  # Initialize the log-likelihood to negative infinity
  ###
  for t in range(0,num_iters):

    ### evaluate the E step
    # update the gamma.
    for idx_over_data in range(0,num_data):
      prod_mixing_coeffs_and_gaussian_l=[mixing_coeffs[0,idx_el]*gaussian_multivar(x=dataset_input[idx_over_data,:].reshape((1,dimn_of_prob)),mean=mean[idx_el,:].reshape((1,dimn_of_prob)),cov_mat=covs[idx_el]) for idx_el in range(0,L)]
      Dr_for_gamma=np.sum(prod_mixing_coeffs_and_gaussian_l)
      gamma[idx_over_data,:]=[prod_mixing_coeffs_and_gaussian_l[k]/Dr_for_gamma for k in range(0,L)]

    #print("gamma obtained:");print(gamma)
    ### M-step: Re-estimating the parameters
    N_l=np.sum(gamma,axis=0)# this is the expression sum_{i=1}^{N(all dataset)} gamma_{il} in the lecture notes
    #print("N_l is:");print(N_l)# NOTE:: shape of N_l = (L,) (note that this array is 1D)

    ### 1. update means: the expresion is like a wtd sum of the vectors
    for idx_el in range(0,L):
      #print("index el is:");print(idx_el)
      temp_vec=np.array(
          [gamma[idx_over_data,idx_el]*dataset_input[idx_over_data,:]
           for idx_over_data in range(0,num_data)]
          )
      mean[idx_el,:]=(1./N_l[idx_el])*np.sum(temp_vec,axis=0)
    #print("mean matrix after update:");print(mean)

    ### 2. update the covariance matrix:
    ##### formula for updating the cov mat is:
    ##### sigma_lth=1/N_l[idx_el] * sum_{i=1}^{N(all_dataset)} * gamma_{il}*some_outer_product
    for idx_el in range(0,L):
      #print("updating cov matrix, index el is:",idx_el)
      temp_outer_prod_list=np.array(
          [gamma[idx_over_data,idx_el]*(dataset_input[idx_over_data,:]-mean[idx_el,:]).reshape((dimn_of_prob,1)) @
           (dataset_input[idx_over_data,:]-mean[idx_el,:]).reshape((1,dimn_of_prob))
          for idx_over_data in range(0,num_data)
         ]
          )
      covs[idx_el]=(1./N_l[idx_el])*sum(temp_outer_prod_list)
    #print("updated covriances");print(covs)

    ### 3. update the mixing coeffs:
    for idx_el in range(0,L):
      mixing_coeffs[0,idx_el]=N_l[idx_el]/num_data

    # Calculate the log-likelihood for the current parameters
    current_log_likelihood = 0
    for idx_over_data in range(num_data):
        log_sum = np.log(sum([mixing_coeffs[0, idx_el] * gaussian_multivar(x=dataset_input[idx_over_data, :].reshape((1, dimn_of_prob)), mean=mean[idx_el, :].reshape((1, dimn_of_prob)), cov_mat=covs[idx_el]) for idx_el in range(L)]))
        current_log_likelihood += log_sum

    # Check for convergence
    if np.abs(current_log_likelihood - prev_log_likelihood) < tol:
        break

    prev_log_likelihood = current_log_likelihood

  return mean,covs,mixing_coeffs
In [ ]:
def fit_the_GMM(X, L):

  # Define your Gaussian function for multivariate data
  def gaussian_multivar(x, mean, cov_mat):
    dimn = np.shape(x)[1]
    inv_cov = np.linalg.inv(cov_mat)
    x_minus_mu = x - mean
    arg_for_expn = 0.5 * np.sum(x_minus_mu @ inv_cov * x_minus_mu, axis=1)
    det_cov = np.linalg.det(cov_mat)
    prob = np.exp(-arg_for_expn) / (np.sqrt(det_cov) * ((2 * np.pi) ** (dimn / 2)))
    return prob

  dimn_of_prob = X.shape[1]

  # Initial guess for GMM parameters
  initial_means = np.random.rand(L, dimn_of_prob)
  initial_covs = [np.eye(dimn_of_prob) for _ in range(L)]
  initial_mixing_coeffs = np.random.rand(1, L)
  initial_mixing_coeffs /= np.sum(initial_mixing_coeffs)  # Normalize

  # Create a dictionary to store covariance matrices
  covariance_matrices = {i: initial_covs[i] for i in range(L)}

  # Train the GMM model using the EM algorithm
  trained_means, trained_covs, trained_mixing_coeffs = EM_training(
      mean=initial_means,
      mixing_coeffs=initial_mixing_coeffs,
      covs=covariance_matrices,
      dataset_input=X,
      num_iters=100,
      tol=0.0001
  )

  print("trained means are:"); print(trained_means)
  print("trained_covs are:");print(trained_covs)
  print("trained_mixing_coeffs are:");print(trained_mixing_coeffs)

  # Plot the original data
  plt.figure(figsize=(8, 6))
  plt.scatter(X[:, 0], X[:, 1], c='blue', label='make_moons dataset', alpha=0.5)


  # Plot the GMM model
  ###############################
  x, y = np.meshgrid(np.linspace(X[:, 0].min(), X[:, 0].max(), 100), np.linspace(X[:, 1].min(), X[:, 1].max(), 100))
  xy = np.column_stack([x.ravel(), y.ravel()])
  pdf_values = np.zeros_like(x)
  for idx in range(L):
      #component_pdf = trained_mixing_coeffs[0, idx] * multivariate_normal.pdf(xy, trained_means[idx, :], trained_covs[idx])
      component_pdf = trained_mixing_coeffs[0, idx] * gaussian_multivar(xy, trained_means[idx, :], trained_covs[idx])
      pdf_values += component_pdf.reshape(x.shape)

  # Create a contour plot for the GMM model with a color bar
  contour = plt.contourf(x, y, pdf_values, levels=10, cmap='viridis', alpha=0.5)
  colorbar = plt.colorbar(contour, label='Probability Density')

  plt.xlabel('Feature 1')
  plt.ylabel('Feature 2')
  plt.legend()
  plt.title('GMM Model and make_moons Dataset')
  plt.show()

  #plot_contour_plot_gmm(X,trained_mixing_coeffs,trained_means,trained_covs,L)



  return trained_means, trained_covs, trained_mixing_coeffs

We employed following strategy to sample from the GMM model:¶

In a Gaussian Mixture Model (GMM) the multiple Gaussian components represents a cluster or mode in your data. The "mixing coefficients" indicate the probability of choosing each component when generating data:

Step 1. Component Selection: First, one needs to select one of the Gaussian components to generate a data point. We do this by sampling from the mixing coefficients. For example, if we have two components with mixing coefficients [0.4, 0.6], we'd choose the first component with a 40% chance and the second component with a 60% chance.

Step.2. Sample from the Selected Component: Once we've selected a component, we sample a data point from that component's Gaussian distribution. This is done by generating random numbers according to the mean and covariance matrix of the selected component's Gaussian distribution.

Step 3: Repeat: We repeat these steps to generate as many data points as you need.

In [ ]:
# sample the data_points from the GMM model
def sample_from_gmm(num_samples, mixing_coeffs, mean_array, covs,X, want_scatter_plot=False):
  from scipy.stats import multivariate_normal

  samples = []
  for _ in range(num_samples):
      # Sample a component based on mixing coefficients
      component = np.random.choice(len(mixing_coeffs[0]), p=mixing_coeffs[0])

      # Sample from the selected component
      sample = np.random.multivariate_normal(mean_array[component, :], covs[component])
      samples.append(sample)
  samples=np.array(samples)

  if want_scatter_plot==True:
    plt.figure(2)
    plt.figure(figsize=(8, 6))
    plt.scatter(X[:, 0], X[:, 1], c='blue', label='make_moons dataset', alpha=0.5)
    plt.scatter(samples[:, 0], samples[:, 1], c='red', label='Generated Data', alpha=0.5)
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.legend()
    plt.title('Generated Data from GMM Model')
    plt.show()

  return samples
In [ ]:
list(datasets.keys())
Out[ ]:
['Small', 'Medium', 'Large']
In [ ]:
### fit the GMM model to the data and then sample

dataset_for_GMM={}; keys_dataset=list(datasets.keys())
for data_size in keys_dataset:
  print("----------------DATASIZE:",data_size,"------------------")
  print("_________________________________________________________")
  X=datasets[data_size];temp={}
  for idx_el in range(1,7):
    print("index_el:",idx_el)
    trained_means, trained_covs, trained_mixing_coeffs=fit_the_GMM(X=X, L=idx_el)

    generated_samples=sample_from_gmm(num_samples=X.shape[0],mixing_coeffs=trained_mixing_coeffs,
                                          mean_array=trained_means,
                                          covs=trained_covs,
                                      X=X, want_scatter_plot=True)
    temp[idx_el]=generated_samples
  dataset_for_GMM[data_size]=temp
----------------DATASIZE: Small ------------------
_________________________________________________________
index_el: 1
L is:
1
trained means are:
[[0.50006995 0.2464502 ]]
trained_covs are:
{0: array([[ 0.753472  , -0.17935397],
       [-0.17935397,  0.24466648]])}
trained_mixing_coeffs are:
[[1.]]
<Figure size 640x480 with 0 Axes>
index_el: 2
L is:
2
trained means are:
[[ 0.80769015  0.17751047]
 [-0.74562807  0.52561939]]
trained_covs are:
{0: array([[ 0.44948797, -0.13198018],
       [-0.13198018,  0.25881563]]), 1: array([[0.04948005, 0.06244601],
       [0.06244601, 0.09018868]])}
trained_mixing_coeffs are:
[[0.80195932 0.19804068]]
<Figure size 640x480 with 0 Axes>
index_el: 3
L is:
3
trained means are:
[[ 0.71459993  0.281603  ]
 [ 0.61840201  0.75452719]
 [ 0.40285576 -0.01647488]]
trained_covs are:
{0: array([[ 0.91569742, -0.24268738],
       [-0.24268738,  0.1313912 ]]), 1: array([[ 0.60645975, -0.18408576],
       [-0.18408576,  0.06919984]]), 2: array([[ 0.77493737, -0.22318156],
       [-0.22318156,  0.15328593]])}
trained_mixing_coeffs are:
[[0.10380025 0.30088703 0.59531273]]
<Figure size 640x480 with 0 Axes>
index_el: 4
L is:
4
trained means are:
[[-0.74373538  0.52724569]
 [ 0.49212539 -0.00998175]
 [ 0.78034677  0.723156  ]
 [ 1.50025431 -0.31479398]]
trained_covs are:
{0: array([[0.04925169, 0.06251597],
       [0.06251597, 0.09093562]]), 1: array([[ 0.14582385, -0.03125686],
       [-0.03125686,  0.12055593]]), 2: array([[ 0.57467376, -0.19671174],
       [-0.19671174,  0.07751427]]), 3: array([[0.08423101, 0.04730756],
       [0.04730756, 0.03251709]])}
trained_mixing_coeffs are:
[[0.2055128  0.33860229 0.27456441 0.18132049]]
<Figure size 640x480 with 0 Axes>
index_el: 5
L is:
5
trained means are:
[[ 0.69253483 -0.09092805]
 [ 0.4064456   0.57350401]
 [ 1.81651015  0.01400419]
 [-0.70462613  0.60873619]
 [-0.57022529  0.47183015]]
trained_covs are:
{0: array([[ 0.1882859 , -0.07826564],
       [-0.07826564,  0.18301614]]), 1: array([[ 0.11998602, -0.02308836],
       [-0.02308836,  0.17391539]]), 2: array([[0.02825188, 0.04299729],
       [0.04299729, 0.07831436]]), 3: array([[0.05003109, 0.05501902],
       [0.05501902, 0.06405516]]), 4: array([[0.18140737, 0.19641821],
       [0.19641821, 0.21639144]])}
trained_mixing_coeffs are:
[[0.34796375 0.24753802 0.17313393 0.17922193 0.05214237]]
<Figure size 640x480 with 0 Axes>
index_el: 6
L is:
6
trained means are:
[[-0.73627212  0.53576089]
 [ 0.04559832  0.22281345]
 [ 0.65519663  0.6311833 ]
 [ 1.78714662 -0.01732783]
 [ 0.71546059 -0.38751407]
 [-0.10457785  0.97105925]]
trained_covs are:
{0: array([[0.05094303, 0.06435182],
       [0.06435182, 0.09268741]]), 1: array([[ 0.00268605, -0.00784069],
       [-0.00784069,  0.03525823]]), 2: array([[ 0.09458215, -0.09117792],
       [-0.09117792,  0.10940841]]), 3: array([[0.03685224, 0.0502607 ],
       [0.0502607 , 0.08314315]]), 4: array([[ 0.12809965, -0.04311216],
       [-0.04311216,  0.01943504]]), 5: array([[0.0019856 , 0.00160915],
       [0.00160915, 0.00448503]])}
trained_mixing_coeffs are:
[[0.21003668 0.09719611 0.2520742  0.18888608 0.21205136 0.03975557]]
<Figure size 640x480 with 0 Axes>
----------------DATASIZE: Medium ------------------
_________________________________________________________
index_el: 1
L is:
1
trained means are:
[[0.49985466 0.24700133]]
trained_covs are:
{0: array([[ 0.74904345, -0.19119175],
       [-0.19119175,  0.2433779 ]])}
trained_mixing_coeffs are:
[[1.]]
<Figure size 640x480 with 0 Axes>
index_el: 2
L is:
2
trained means are:
[[0.44917222 0.04491844]
 [0.62286977 0.73749168]]
trained_covs are:
{0: array([[ 0.81234166, -0.228497  ],
       [-0.228497  ,  0.17381113]]), 1: array([[ 0.57404025, -0.18584238],
       [-0.18584238,  0.07252806]])}
trained_mixing_coeffs are:
[[0.7082144 0.2917856]]
<Figure size 640x480 with 0 Axes>
index_el: 3
L is:
3
trained means are:
[[-0.7737834   0.51975744]
 [ 1.76511022 -0.04300051]
 [ 0.49330375  0.25437384]]
trained_covs are:
{0: array([[0.040528  , 0.05267169],
       [0.05267169, 0.08300167]]), 1: array([[0.04383146, 0.0543914 ],
       [0.0543914 , 0.08102181]]), 2: array([[ 0.18834427, -0.12138886],
       [-0.12138886,  0.29419402]])}
trained_mixing_coeffs are:
[[0.18909945 0.19354863 0.61735192]]
<Figure size 640x480 with 0 Axes>
index_el: 4
L is:
4
trained means are:
[[ 0.48864373 -0.18266959]
 [-0.69458279  0.58416462]
 [ 1.75212006 -0.0556088 ]
 [ 0.57128862  0.66601923]]
trained_covs are:
{0: array([[ 0.17184121, -0.11440647],
       [-0.11440647,  0.10408415]]), 1: array([[0.06983778, 0.07260714],
       [0.07260714, 0.09321453]]), 2: array([[0.04731459, 0.05721665],
       [0.05721665, 0.08278782]]), 3: array([[ 0.13389047, -0.10183705],
       [-0.10183705,  0.09826414]])}
trained_mixing_coeffs are:
[[0.30155874 0.22387285 0.20060659 0.27396182]]
<Figure size 640x480 with 0 Axes>
index_el: 5
L is:
5
trained means are:
[[ 0.8985737  -0.45494526]
 [ 1.00047115  0.27866819]
 [ 0.26990034  0.8767014 ]
 [-0.77334284  0.52525856]
 [ 1.00370896 -0.17079393]]
trained_covs are:
{0: array([[ 0.09583784, -0.01040648],
       [-0.01040648,  0.00459743]]), 1: array([[ 0.61772248, -0.02023903],
       [-0.02023903,  0.02871827]]), 2: array([[ 0.12956334, -0.03827076],
       [-0.03827076,  0.01906512]]), 3: array([[0.03871495, 0.05015285],
       [0.05015285, 0.0782377 ]]), 4: array([[ 0.50148919, -0.03404347],
       [-0.03404347,  0.02566867]])}
trained_mixing_coeffs are:
[[0.15765052 0.24341738 0.21396886 0.18540784 0.1995554 ]]
<Figure size 640x480 with 0 Axes>
index_el: 6
L is:
6
trained means are:
[[-0.79592265  0.49642325]
 [ 0.82699921  0.4705127 ]
 [ 0.04192508  0.95596168]
 [ 1.81433009  0.00769911]
 [ 0.9882576  -0.46320741]
 [ 0.18051161  0.02682711]]
trained_covs are:
{0: array([[0.03271249, 0.04497008],
       [0.04497008, 0.07611623]]), 1: array([[ 0.02727174, -0.03769265],
       [-0.03769265,  0.06601615]]), 2: array([[ 0.0738626 , -0.00207535],
       [-0.00207535,  0.00401852]]), 3: array([[0.02719964, 0.038613  ],
       [0.038613  , 0.06770735]]), 4: array([[ 0.08176539, -0.00181037],
       [-0.00181037,  0.00338919]]), 5: array([[ 0.02878055, -0.04141316],
       [-0.04141316,  0.07297738]])}
trained_mixing_coeffs are:
[[0.18017456 0.16739798 0.15220133 0.17229991 0.15677852 0.17114771]]
<Figure size 640x480 with 0 Axes>
----------------DATASIZE: Large ------------------
_________________________________________________________
index_el: 1
L is:
1
trained means are:
[[0.5006108  0.24937081]]
trained_covs are:
{0: array([[ 0.75409776, -0.19372119],
       [-0.19372119,  0.24760576]])}
trained_mixing_coeffs are:
[[1.]]
<Figure size 640x480 with 0 Axes>
index_el: 2
L is:
2
trained means are:
[[ 0.62829721  0.09056855]
 [-0.0507244   0.93506067]]
trained_covs are:
{0: array([[ 0.81228134, -0.13225792],
       [-0.13225792,  0.16964158]]), 1: array([[0.12849924, 0.00648573],
       [0.00648573, 0.00518643]])}
trained_mixing_coeffs are:
[[0.81195531 0.18804469]]
<Figure size 640x480 with 0 Axes>
index_el: 3
L is:
3
trained means are:
[[ 1.74385897 -0.06299622]
 [-0.76689092  0.54119564]
 [ 0.48653496  0.26148846]]
trained_covs are:
{0: array([[0.05194519, 0.06166023],
       [0.06166023, 0.09103468]]), 1: array([[0.04516723, 0.05632036],
       [0.05632036, 0.08569044]]), 2: array([[ 0.17811734, -0.10976195],
       [-0.10976195,  0.29223171]])}
trained_mixing_coeffs are:
[[0.20444224 0.19384816 0.60170959]]
<Figure size 640x480 with 0 Axes>
index_el: 4
L is:
4
trained means are:
[[ 1.81235615  0.00383034]
 [ 0.21044536  0.12828942]
 [ 0.77699853  0.37363025]
 [-0.81836973  0.48956207]]
trained_covs are:
{0: array([[0.0286931 , 0.04150261],
       [0.04150261, 0.07661638]]), 1: array([[ 0.14192445, -0.18107452],
       [-0.18107452,  0.28678854]]), 2: array([[ 0.14993125, -0.18741391],
       [-0.18741391,  0.29675392]]), 3: array([[0.02890473, 0.04178497],
       [0.04178497, 0.07497171]])}
trained_mixing_coeffs are:
[[0.17381666 0.32509748 0.33048881 0.17059706]]
<Figure size 640x480 with 0 Axes>
index_el: 5
L is:
5
trained means are:
[[ 1.81236186  0.00516478]
 [ 0.619187    0.64041602]
 [-0.76123566  0.54781842]
 [ 0.08324945  0.24193606]
 [ 0.93948828 -0.45298527]]
trained_covs are:
{0: array([[0.02838007, 0.04070096],
       [0.04070096, 0.07516323]]), 1: array([[ 0.11794533, -0.10043192],
       [-0.10043192,  0.10747158]]), 2: array([[0.04568965, 0.0567913 ],
       [0.0567913 , 0.08589983]]), 3: array([[ 0.03743876, -0.07661123],
       [-0.07661123,  0.19122726]]), 4: array([[ 0.09528433, -0.00751979],
       [-0.00751979,  0.00424314]])}
trained_mixing_coeffs are:
[[0.17411171 0.26193946 0.19831888 0.19815708 0.16747287]]
<Figure size 640x480 with 0 Axes>
index_el: 6
L is:
6
trained means are:
[[ 0.26756185 -0.07134769]
 [-0.86519315  0.42787887]
 [ 1.68143494 -0.11412851]
 [ 0.94456531 -0.05292318]
 [ 0.58870719  0.76962196]
 [-0.19904052  0.94382425]]
trained_covs are:
{0: array([[ 0.05511159, -0.06300095],
       [-0.06300095,  0.0884764 ]]), 1: array([[0.01717837, 0.02795799],
       [0.02795799, 0.06008518]]), 2: array([[0.0742036 , 0.07681313],
       [0.07681313, 0.09843615]]), 3: array([[0.0076563 , 0.00114038],
       [0.00114038, 0.14426678]]), 4: array([[ 0.04412181, -0.03133136],
       [-0.03133136,  0.02659675]]), 5: array([[0.06481334, 0.01158746],
       [0.01158746, 0.00533173]])}
trained_mixing_coeffs are:
[[0.21219401 0.14627734 0.23401158 0.13300356 0.13446874 0.14004476]]
<Figure size 640x480 with 0 Axes>
In [ ]:
# with open('GMM_dataset_dictionary.pkl', 'wb') as f:
#     pickle.dump(dataset_for_GMM, f)

Observations:

  1. For the given data-set, increasing the number of mixers in our GMM model helps in learning the distribution, something we would expect as the dataset is clearly not uni-modal so to speak.

  2. As can clearly be seen, for a particular L (number of mixer components in the GMM), more the number of dataset, better is the estimate of the probability. With larger dataset, a GMM model of low value of L (less number of mixtures) is able to model the distribution as good as a GMM model with large l trained over low number of data-points.

This can also be seen more explicitly from the plot of $mmd^{2}$ for different L for each dataset of different sizes ('small':100 data points, 'medium':500 data points, 'high':1000 datapoints) plotted below!

1.4 Kernel density estimation.¶

Note that the kernel density estimation model is a non-parametric model, hence it does not require any "training" so to speak!

  • discusses how to choose an appropriate bandwidth parameter: https://stats.libretexts.org/Bookshelves/Computing_and_Modeling/Supplemental_Modules_(Computing_and_Modeling)/Regression_Analysis/Nonparametric_Inference_-_Kernel_Density_Estimation#:~:text=Contributors-,Introduction%20and%20definition,%E2%88%88%CE%98%E2%8A%82Rd%7D.

  • Squared exponential kernel:

https://www.mathworks.com/help/stats/kernel-covariance-function-options.html

In [ ]:
def squared_exp_kernel(xa:np.array, xb:np.array,bandwidth:float,amplitude:float=1):
  '''
  xa=row vector of shape (1xdimn_of_the_prob)
  xb= row vector of shape (1X dimn_of_the_prob)
  bandwidth: a hyperparameter
  amplitude: overall variance (set to 1 in the lecture, can be changed as well)
  '''
  diff=xa-xb
  kernel=(amplitude**2) *  np.exp( -np.dot(diff,diff)/(2*bandwidth) )### what we are calling bandwidth is also refered to as sigma^2 is some of the literature. Here we have followed the notation followed in the lecture
  return kernel

def inverse_multi_quadratic_kernel(xa:np.array, xb:np.array,bandwidth:float):
  '''
  xa=row vector of shape (1xdimn_of_the_prob)
  xb= row vector of shape (1X dimn_of_the_prob)
  bandwidth: a hyperparameter
  '''
  diff=xa-xb
  kernel=bandwidth/(bandwidth+ ((norm(diff))**2) )
  return kernel

def kernel_density_estimator(x:np.array, dataset:np.array, bandwidth:float, kernel_fn):
  '''
  x: a row vector (1X dimn of the problem) at which one wants to evaluate the KDE (prob function)
  dataset= an array of shape (N X dimn of the problem)
  bandwidth: hyperparameter
  kernel_fn: choice of the user
  '''
  num_data=np.shape(dataset)[0]
  list_kernel=[kernel_fn(xa=x,xb=dataset[j,:],bandwidth=bandwidth) for j in range(0,num_data)]
  prob=(1./num_data)*sum(list_kernel)## sometimes instead of 1./num_data , we also have 1./(bandwidth X num_data)
  return prob
In [ ]:
def estimate_bandwidth_with_silverman_rule(data):
    # Step 1: Calculate the standard deviation
    std_dev = np.std(data)

    # Step 2: Calculate the interquartile range (IQR)
    Q1 = np.percentile(data, 25, axis=0)
    Q3 = np.percentile(data, 75, axis=0)
    IQR = Q3 - Q1

    # Step 3: Estimate the bandwidth using Silverman's rule of thumb
    n = len(data)
    bandwidth = 0.9 * min(std_dev, np.mean(IQR) / 1.34) * n ** (-1 / 5)

    return bandwidth
In [ ]:
# contour plot kde
def contour_plot_kde(X,bandwidth,xmin=-2,xmax=2,ymin=-1.5,ymax=1.5):

  # # Create a grid of points within the specified range
  x_grid = np.linspace(xmin, xmax, 100)
  y_grid = np.linspace(ymin, ymax, 100)
  X_grid, Y_grid = np.meshgrid(x_grid, y_grid)
  points = np.c_[X_grid.ravel(), Y_grid.ravel()]

  # Evaluate the KDE at each point in the grid
  kde_values = np.array([kernel_density_estimator(point, X, bandwidth,kernel_fn=squared_exp_kernel) for point in points])
  kde_values = kde_values.reshape(X_grid.shape)

  # Create a contour plot
  plt.figure(figsize=(8, 6))
  plt.contourf(X_grid, Y_grid, kde_values, levels=100, cmap='viridis')
  plt.scatter(X[:, 0], X[:, 1], c='blue', label='make_moons dataset', alpha=0.5)
  plt.colorbar(label='Probability Density')
  plt.xlabel('Feature 1')
  plt.ylabel('Feature 2')
  plt.title('Scatter Plot for data and Contour Plot of KDE Probability Density')
  plt.legend()
  plt.show()

# contour plot for distribution from the KDE
# xmin, xmax = -1, 2  # Adjust the range as needed
# ymin, ymax = -1, 1  # Adjust the range as needed
# # bandwidth =  estimate_bandwidth_with_silverman_rule(X)
# # print("Estimated bandwidth using silverman rule:", bandwidth)
# contour_plot_kde(X,bandwidth=0.1,xmin=-1,xmax=2,ymin=-1,ymax=1)
In [ ]:
### Sampling from the KDE
def sample_from_kde(data, bandwidth, num_samples,want_scatter_plot=False):
  samples = []
  for _ in range(num_samples):
      # Randomly select a data point
      data_point = data[np.random.randint(len(data))]

      # Generate random perturbations based on the Gaussian kernel
      perturbations = np.random.normal(0, bandwidth, size=data_point.shape)

      # Add the perturbations to the selected data point
      sample = data_point + perturbations
      samples.append(sample)

  samples=np.array(samples)

  if want_scatter_plot==True:
    # Sample from the KDE
    # visualise the original and the generated data
    plt.figure(figsize=(8,6))
    plt.scatter(X[:, 0], X[:, 1,], c='b', label='Original Data')
    plt.scatter(samples[:, 0], samples[:, 1,], c='r', label='Generated Data')
    plt.legend()
    plt.tight_layout()
    plt.show()

  return samples
In [ ]:
dataset_kde={};bandwidth=[0.01,0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4];keys_dataset=list(datasets.keys())
keys_dataset=list(datasets.keys())
for data_size in keys_dataset:
  print("--------data_size:",data_size)
  X=datasets[data_size];temp={}
  for bw in bandwidth:
    print("bandwidth:",bw)
    contour_plot_kde(X,bandwidth=bw,xmin=-2,xmax=2.5,ymin=-1.5,ymax=1.5)
    generated_data_KDE=sample_from_kde(data=X,bandwidth=bw,num_samples=X.shape[0],want_scatter_plot=True)
    temp[bw]=generated_data_KDE
  dataset_kde[data_size]=temp
--------data_size: Small
bandwidth: 0.01
bandwidth: 0.05
bandwidth: 0.1
bandwidth: 0.15
bandwidth: 0.2
bandwidth: 0.25
bandwidth: 0.3
bandwidth: 0.35
bandwidth: 0.4
--------data_size: Medium
bandwidth: 0.01
bandwidth: 0.05
bandwidth: 0.1
bandwidth: 0.15
bandwidth: 0.2
bandwidth: 0.25
bandwidth: 0.3
bandwidth: 0.35
bandwidth: 0.4
--------data_size: Large
bandwidth: 0.01
bandwidth: 0.05
bandwidth: 0.1
bandwidth: 0.15
bandwidth: 0.2
bandwidth: 0.25
bandwidth: 0.3
bandwidth: 0.35
bandwidth: 0.4
In [ ]:
# with open('KDE_dataset_dictionary_smaller_bw_range.pkl', 'wb') as f:
#     pickle.dump(dataset_kde, f)

Implementation of MMD metric:¶

a good resource for MMD: https://www.onurtunali.com/ml/2019/03/08/maximum-mean-discrepancy-in-machine-learning.html

In [ ]:
import numpy as np

def estimate_mmd_squared(samples1, samples2, kernel_function, bandwidth=1.0):
    """
    Estimate the Maximum Mean Discrepancy (MMD) squared between two sets of samples using a kernel function.

    Args:
    - samples1: the first set of samples.
                A numpy array of shape (n_samples1, n_features) representing .
    - samples2: the second set of samples.
                A numpy array of shape (n_samples2, n_features) representing .
    - kernel_function: A function that computes the kernel value between two samples.
    - bandwidth: Bandwidth parameter for the kernel function.

    Returns:
    - mmd_squared: The estimated MMD squared.
    """
    num_samples1 = samples1.shape[0]
    num_samples2 = samples2.shape[0]

    # Compute the MMD squared statistic
    mmd_squared1,mmd_squared2,mmd_squared12 = 0.0,0.0,0.0

    for i in range(num_samples1):
        for j in range(num_samples1):
            mmd_squared1 += kernel_function(samples1[i], samples1[j], bandwidth)
    mmd_squared1 *= 1.0 / (num_samples1 * (num_samples1 - 1))

    for i in range(num_samples2):
        for j in range(num_samples2):
            mmd_squared2 += kernel_function(samples2[i], samples2[j], bandwidth)
    mmd_squared2 *= 1.0 / (num_samples2 * (num_samples2 - 1))

    for i in range(num_samples1):
        for j in range(num_samples2):
            mmd_squared12 -= 2.0 * kernel_function(samples1[i], samples2[j], bandwidth)
    mmd_squared12 *= 1.0 / (num_samples1 * (num_samples2))

    mmd_squared=mmd_squared1+mmd_squared2+mmd_squared12


    return mmd_squared

def squared_exp_kernel(xa, xb, bandwidth, amplitude=1.0):
    """
    Squared Exponential (RBF) kernel function.

    Args:
    - xa: A numpy array of shape (1, dimn_of_the_prob).
    - xb: A numpy array of shape (1, dimn_of_the_prob).
    - bandwidth: Bandwidth parameter.
    - amplitude: Overall variance (default is 1).

    Returns:
    - kernel: The kernel value.
    """
    diff = xa - xb
    kernel = (amplitude ** 2) * np.exp(-np.dot(diff, diff) / (2 * bandwidth))
    return kernel

def inverse_multi_quadratic_kernel(xa, xb, bandwidth):
    """
    Inverse Multi-Quadratic kernel function.

    Args:
    - xa: A numpy array of shape (1, dimn_of_the_prob).
    - xb: A numpy array of shape (1, dimn_of_the_prob).
    - bandwidth: Bandwidth parameter.

    Returns:
    - kernel: The kernel value.
    """
    diff = xa - xb
    kernel = bandwidth / (bandwidth + np.sum(diff ** 2))
    return kernel
In [ ]:
def plot_dict_of_dicts(data, x_label='X-values', y_label='Y-values', title='Two-Line Point Plot'):
    """
    Plot a dictionary of dictionaries as separate lines on a two-line point plot.

    Args:
        data (dict): A dictionary where keys represent labels and values are dictionaries with x and y values.
        x_label (str): Label for the x-axis.
        y_label (str): Label for the y-axis.
        title (str): Title for the plot.

    Returns:
        None
    """
    plt.figure(figsize=(8, 6))

    for label, values in data.items():
        x_values = list(values.keys())
        y_values = list(values.values())
        plt.plot(x_values, y_values,":" ,marker='o', label=label)

    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(title)
    plt.legend()
    plt.grid(True)
    plt.show()

Using squared exponential kernel to estimate the MMD.¶

evaluating mmds for GMMs using squared exponential kernel¶

In [ ]:
size_dataset=["Small","Medium","Large"];mmd_gmm_model_sqd_exp_kernel={}
for data_size in size_dataset:
  print("----------------DATASIZE:",data_size,"------------------")
  X=datasets[data_size];
  temp={}
  for idx_el in range(1,7):
    dataset_sampled_gmm=dataset_for_GMM[data_size][idx_el]
    temp[idx_el]=estimate_mmd_squared(samples1=dataset_sampled_gmm,
                                           samples2=X, kernel_function=squared_exp_kernel)
    print("for L=",idx_el," MMD^2 score:");print(temp[idx_el])
  mmd_gmm_model_sqd_exp_kernel[data_size]=temp
----------------DATASIZE: Small ------------------
for L= 1  MMD^2 score:
0.021462996536169454
for L= 2  MMD^2 score:
0.015153576109734779
for L= 3  MMD^2 score:
0.023083391802166675
for L= 4  MMD^2 score:
0.0157404628433937
for L= 5  MMD^2 score:
0.012968477216233731
for L= 6  MMD^2 score:
0.011881913431552427
----------------DATASIZE: Medium ------------------
for L= 1  MMD^2 score:
0.00808000119872232
for L= 2  MMD^2 score:
0.007847838415810715
for L= 3  MMD^2 score:
0.002854229711848566
for L= 4  MMD^2 score:
0.0026202490229276787
for L= 5  MMD^2 score:
0.0030813828475186877
for L= 6  MMD^2 score:
0.0022849453894315808
----------------DATASIZE: Large ------------------
for L= 1  MMD^2 score:
0.008141649846514776
for L= 2  MMD^2 score:
0.0035073972497046135
for L= 3  MMD^2 score:
0.0015671430299024625
for L= 4  MMD^2 score:
0.0019851960641910082
for L= 5  MMD^2 score:
0.0013715822944770917
for L= 6  MMD^2 score:
0.0015351823895199956
In [ ]:
plot_dict_of_dicts(data=mmd_gmm_model_sqd_exp_kernel,
                   x_label="L values",
                   y_label="MMD_squared",
                   title="MMD_squared (with sqd_exp kernel) plot for GMM for dataset of different lengths ")

evaluating mmds for KDEs using squared exponential kernel¶

In [ ]:
mmd_kde_model_sqd_exp_kernel={}
bandwidth_for_kde=[0.01,0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4];keys_dataset=list(datasets.keys())
size_dataset=["Small","Medium","Large"]
for data_size in size_dataset:
  print("--------Data_size:",data_size,"-----------")
  X=datasets[data_size];temp={}
  for bw in bandwidth_for_kde:
    dataset_sampled_kde=dataset_kde[data_size][bw]
    temp[bw]=estimate_mmd_squared(samples1=dataset_sampled_kde,
                                           samples2=X, kernel_function=squared_exp_kernel)
    print("for bandwidth:",bw," MMD^2 score:",temp[bw])
  mmd_kde_model_sqd_exp_kernel[data_size]=temp
--------Data_size: Small -----------
for bandwidth: 0.01  MMD^2 score: 0.0227324569272509
for bandwidth: 0.05  MMD^2 score: 0.014870248884561788
for bandwidth: 0.1  MMD^2 score: 0.0152579907442425
for bandwidth: 0.15  MMD^2 score: 0.013652325888171557
for bandwidth: 0.2  MMD^2 score: 0.014246454194056346
for bandwidth: 0.25  MMD^2 score: 0.013828091300571055
for bandwidth: 0.3  MMD^2 score: 0.018295668526015052
for bandwidth: 0.35  MMD^2 score: 0.013389361599205007
for bandwidth: 0.4  MMD^2 score: 0.01629857467187279
--------Data_size: Medium -----------
for bandwidth: 0.01  MMD^2 score: 0.004230828587042268
for bandwidth: 0.05  MMD^2 score: 0.00229779593652113
for bandwidth: 0.1  MMD^2 score: 0.0034189463359119987
for bandwidth: 0.15  MMD^2 score: 0.0030637120826850772
for bandwidth: 0.2  MMD^2 score: 0.0037768409473978304
for bandwidth: 0.25  MMD^2 score: 0.004494465353473864
for bandwidth: 0.3  MMD^2 score: 0.004621511270823331
for bandwidth: 0.35  MMD^2 score: 0.007253995535964286
for bandwidth: 0.4  MMD^2 score: 0.00628754405726728
--------Data_size: Large -----------
for bandwidth: 0.01  MMD^2 score: 0.0016516745432757496
for bandwidth: 0.05  MMD^2 score: 0.0028152656201276827
for bandwidth: 0.1  MMD^2 score: 0.001332365306855099
for bandwidth: 0.15  MMD^2 score: 0.0015215272280293402
for bandwidth: 0.2  MMD^2 score: 0.0026557089785370636
for bandwidth: 0.25  MMD^2 score: 0.002927438603620436
for bandwidth: 0.3  MMD^2 score: 0.003200250134513638
for bandwidth: 0.35  MMD^2 score: 0.004580846148862894
for bandwidth: 0.4  MMD^2 score: 0.006475937707462465
In [ ]:
plot_dict_of_dicts(data=mmd_kde_model_sqd_exp_kernel,
                   x_label="bandwidth (h) for KDE",
                   y_label="MMD_squared",
                   title="MMD_squared (with sqd_exp kernel) plot for KDE for dataset of different lengths ")

Evaluating MMDS using inverse multi-quadratic fn¶

for GMM

In [ ]:
size_dataset=["Small","Medium","Large"];mmd_gmm_model_inv_quad={}
for data_size in size_dataset:
  print("----------------DATASIZE:",data_size,"------------------")
  X=datasets[data_size];
  temp={}
  for idx_el in range(1,7):
    print("L:",idx_el)
    dataset_sampled_gmm=dataset_for_GMM[data_size][idx_el]
    temp[idx_el]=estimate_mmd_squared(samples1=dataset_sampled_gmm,
                                           samples2=X, kernel_function=inverse_multi_quadratic_kernel)
    print("For L=",idx_el," MMD^2 score:",temp[idx_el])
  mmd_gmm_model_inv_quad[data_size]=temp
----------------DATASIZE: Small ------------------
L: 1
For L= 1  MMD^2 score: 0.02522100706992847
L: 2
For L= 2  MMD^2 score: 0.019936303817976953
L: 3
For L= 3  MMD^2 score: 0.02869786973074495
L: 4
For L= 4  MMD^2 score: 0.015868479819498127
L: 5
For L= 5  MMD^2 score: 0.01374438812674006
L: 6
For L= 6  MMD^2 score: 0.012066812679013106
----------------DATASIZE: Medium ------------------
L: 1
For L= 1  MMD^2 score: 0.013191802458515478
L: 2
For L= 2  MMD^2 score: 0.012626625454044249
L: 3
For L= 3  MMD^2 score: 0.0049995159821693
L: 4
For L= 4  MMD^2 score: 0.0032303076786441842
L: 5
For L= 5  MMD^2 score: 0.005232252075411892
L: 6
For L= 6  MMD^2 score: 0.002370310013506516
----------------DATASIZE: Large ------------------
L: 1
For L= 1  MMD^2 score: 0.01372832858219697
L: 2
For L= 2  MMD^2 score: 0.00682231846961745
L: 3
For L= 3  MMD^2 score: 0.003644204407087237
L: 4
For L= 4  MMD^2 score: 0.004022457051592765
L: 5
For L= 5  MMD^2 score: 0.0018976849608979274
L: 6
For L= 6  MMD^2 score: 0.0017011299403707492
In [ ]:
plot_dict_of_dicts(data=mmd_gmm_model_inv_quad,
                   x_label="L values",
                   y_label="MMD_squared",
                   title="MMD_squared (with inv-multi-quad kernel) plot for GMM for dataset of different lengths")

for KDE:

In [ ]:
mmd_kde_model_inv_quad={}
bandwidth_for_kde=[0.01,0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4];keys_dataset=list(datasets.keys())
size_dataset=["Small","Medium","Large"]
for data_size in size_dataset:
  print("--------Data_size:",data_size,"-----------")
  X=datasets[data_size];temp={}
  for bw in bandwidth_for_kde:
    dataset_sampled_kde=dataset_kde[data_size][bw]
    temp[bw]=estimate_mmd_squared(samples1=dataset_sampled_kde,
                                           samples2=X, kernel_function=inverse_multi_quadratic_kernel)
    print("for bandwidth:",bw," MMD^2 score:",temp[bw])
  mmd_kde_model_inv_quad[data_size]=temp
--------Data_size: Small -----------
for bandwidth: 0.01  MMD^2 score: 0.021025600080310625
for bandwidth: 0.05  MMD^2 score: 0.01418040320122893
for bandwidth: 0.1  MMD^2 score: 0.01425900388764445
for bandwidth: 0.15  MMD^2 score: 0.013629291631276352
for bandwidth: 0.2  MMD^2 score: 0.014214256636971845
for bandwidth: 0.25  MMD^2 score: 0.015443687141364015
for bandwidth: 0.3  MMD^2 score: 0.01853420968861652
for bandwidth: 0.35  MMD^2 score: 0.014411663277056563
for bandwidth: 0.4  MMD^2 score: 0.017518280459942748
--------Data_size: Medium -----------
for bandwidth: 0.01  MMD^2 score: 0.004301337898196311
for bandwidth: 0.05  MMD^2 score: 0.0023686525534216685
for bandwidth: 0.1  MMD^2 score: 0.003316046723267796
for bandwidth: 0.15  MMD^2 score: 0.0034443858511293834
for bandwidth: 0.2  MMD^2 score: 0.00376625869288183
for bandwidth: 0.25  MMD^2 score: 0.004871164915991488
for bandwidth: 0.3  MMD^2 score: 0.006049072647813292
for bandwidth: 0.35  MMD^2 score: 0.007820356559834263
for bandwidth: 0.4  MMD^2 score: 0.008314072709681763
--------Data_size: Large -----------
for bandwidth: 0.01  MMD^2 score: 0.0015842704000799124
for bandwidth: 0.05  MMD^2 score: 0.0024984858386801756
for bandwidth: 0.1  MMD^2 score: 0.0013548147448194658
for bandwidth: 0.15  MMD^2 score: 0.0017721989809129646
for bandwidth: 0.2  MMD^2 score: 0.0028602606360367266
for bandwidth: 0.25  MMD^2 score: 0.0034149266742496964
for bandwidth: 0.3  MMD^2 score: 0.004455007301260472
for bandwidth: 0.35  MMD^2 score: 0.005633445445235363
for bandwidth: 0.4  MMD^2 score: 0.007897413444463042
In [ ]:
plot_dict_of_dicts(data=mmd_kde_model_inv_quad,
                   x_label="bandwidth (h) for KDE",
                   y_label="MMD_squared",
                   title="MMD^{squared} (with inverse_multi_quadratic_kernel) for KDE for dataset of different lengths ")

Generic observations:¶

  1. In the case of both GMMs with small L (number of mixtures) and KDE with otherwise not good bandwidth (>0.25 in our case), having large number of data-points is favourable!

  2. We expected (and as also can be seen from the plots above) the single gaussian to act the worse (as can be seen from l=1 case in the GMM) as our dataset is quite 'multimodal' so to speak which it clearly cannot capture.

2. Higher-dimensional data¶

Loading and splitting the data¶

In [ ]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_digits
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split

Let's load the dataset and split in train and test set and analyze how well the digits are distributed:

In [ ]:
digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, test_size=0.2)

# Count the frequency of each number from 0 to 9 in y_pred_gauss
count = np.bincount(y_train)

# Create a bar chart to visualize the frequencies
plt.bar(range(10), count)
plt.xticks(range(10))
plt.xlabel("Predicted Numbers")
plt.ylabel("Frequency")
plt.title("Frequency of Predicted Numbers")
plt.show()

We see, that the digits are in evenly distributed frequency in the train dataset

Define Maximum Mean Discrepancy¶

In [ ]:
def MMD(x, y, kernel):
    """Emprical maximum mean discrepancy. The lower the result
       the more evidence that distributions are the same.

    Args:
        x: first sample, distribution P
        y: second sample, distribution Q
        kernel: kernel type such as "rbf" or "inverse-multi-quadratic"
    """
    # Compute squared Euclidean distances

    xx, yy, zz = np.dot(x, x.T), np.dot(y, y.T), np.dot(x, y.T)
    rx = np.diag(xx).reshape(1, -1).repeat(xx.shape[0], axis=0)
    ry = np.diag(yy).reshape(1, -1).repeat(yy.shape[0], axis=0)

    dxx = rx.T + rx - 2. * xx  # Used for A in (1)
    dyy = ry.T + ry - 2. * yy  # Used for B in (1)
    dxy = rx.T + ry - 2. * zz  # Used for C in (1)

    XX, YY, XY = (np.zeros_like(xx),
                  np.zeros_like(xx),
                  np.zeros_like(xx))

    # Squared Exponential or Gaussian Kernel
    if kernel == "rbf":
        #bandwidth_range = [10, 15, 20, 50,100]
        bandwidth_range = [10, 15, 20, 25, 30]
        XX = sum(np.exp(-0.5 * dxx / a) for a in bandwidth_range)
        YY = sum(np.exp(-0.5 * dyy / a) for a in bandwidth_range)
        XY = sum(np.exp(-0.5 * dxy / a) for a in bandwidth_range)

    # Inverse Multi-Quadratic Kernel
    if kernel == "inverse-multi-quadratic":
        #bandwidth_range = [10, 15, 20, 50,100]
        bandwidth_range = [10, 15, 20, 25, 30]
        XX = sum(1 / (dxx / a + 1) for a in bandwidth_range)
        YY = sum(1 / (dyy / a + 1) for a in bandwidth_range)
        XY = sum(1 / (dxy / a + 1) for a in bandwidth_range)

    return np.mean(XX + YY - 2. * XY)

Density Forest¶

As a first generative network we try to implement the Density Forest implemtation from https://github.com/kfritsch/density_forest

In [ ]:
from DensityForest import DensityForest
X = X_train
density_forest = DensityForest(n_estimators=20)
density_forest.fit(X)
0.9956248904177695
0.9871580187119091
0.9893861428450302
0.9755717732196794
0.9803251047036711
[2.04878899 1.83218451]
[[0.37904361 0.0067639 ]
 [0.0067639  0.724158  ]]
[8.12902327 7.97001579]
[[0.69730777 0.05799666]
 [0.05799666 0.41239611]]
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
c:\Users\luke\OneDrive\Dokumente\UniHeidelberg\Master\Semester3\Generative Neural Networks\code\Assignment_1\Copy_of_generative_baselines.ipynb Cell 61 line 4
      <a href='vscode-notebook-cell:/c%3A/Users/luke/OneDrive/Dokumente/UniHeidelberg/Master/Semester3/Generative%20Neural%20Networks/code/Assignment_1/Copy_of_generative_baselines.ipynb#Y114sZmlsZQ%3D%3D?line=1'>2</a> X = X_train
      <a href='vscode-notebook-cell:/c%3A/Users/luke/OneDrive/Dokumente/UniHeidelberg/Master/Semester3/Generative%20Neural%20Networks/code/Assignment_1/Copy_of_generative_baselines.ipynb#Y114sZmlsZQ%3D%3D?line=2'>3</a> density_forest = DensityForest(n_estimators=20)
----> <a href='vscode-notebook-cell:/c%3A/Users/luke/OneDrive/Dokumente/UniHeidelberg/Master/Semester3/Generative%20Neural%20Networks/code/Assignment_1/Copy_of_generative_baselines.ipynb#Y114sZmlsZQ%3D%3D?line=3'>4</a> density_forest.fit(X)

File c:\Users\luke\OneDrive\Dokumente\UniHeidelberg\Master\Semester3\Generative Neural Networks\code\Assignment_1\DensityForest.py:93, in DensityForest.fit(self, X)
     91 if (self.boostrap):
     92     tree_sample = X[get_bootstrap_indices(len(X), len(X))]
---> 93 tree.fit(tree_sample)
     95 # t_dict = {}
     96 # for node in tree.tree:
     97 #     if(not(node==0) and node.isLeaf):
     98 #         t_dict[node.pointer] = [node.mean, node.cov]
     99 # self.tree_dicts.append(t_dict)
    101 t_dict = {}

File c:\Users\luke\OneDrive\Dokumente\UniHeidelberg\Master\Semester3\Generative Neural Networks\code\Assignment_1\DensityTree.py:123, in RandomDensityTree.fit(self, data, axis)
    121 self.mean = np.mean(data, axis=0)
    122 self.cov = np.cov(np.transpose(data))
--> 123 self.rootnode = Node(data, self.cov, [], self.tree, num_splits=self.num_splits, min_infogain=self.min_infogain,
    124                      max_depth=self.max_depth, pointer=0, rand=self.rand, splittype=self.splittype)
    125 self.tree[0] = self.rootnode

File c:\Users\luke\OneDrive\Dokumente\UniHeidelberg\Master\Semester3\Generative Neural Networks\code\Assignment_1\DensityTree.py:234, in Node.__init__(self, data, cov, history, tree, num_splits, min_infogain, max_depth, pointer, rand, splittype)
    231     left_data, right_data = split_data_lin(
    232         data, rnd_splits[s]['split'], rnd_splits[s]['direction'])
    233 else:
--> 234     left_data, right_data = split_data_axis(
    235         data, rnd_splits[s]['split'], rnd_splits[s]['direction'])
    237 if (left_data.shape[0] > 2 and right_data.shape[0] > 2):
    239     right_datas.append(right_data)

File c:\Users\luke\OneDrive\Dokumente\UniHeidelberg\Master\Semester3\Generative Neural Networks\code\Assignment_1\DensityTree.py:69, in split_data_axis(data, split, direction)
     67 for d in range(data.shape[0]):
     68     if data[d][direction] <= split:
---> 69         left_data = np.append(left_data, np.reshape(
     70             data[d], (1, data[d].shape[0])), axis=0)
     71     else:
     72         right_data = np.append(right_data, np.reshape(
     73             data[d], (1, data[d].shape[0])), axis=0)

File <__array_function__ internals>:200, in append(*args, **kwargs)

File c:\Users\luke\anaconda3\envs\ml_homework\Lib\site-packages\numpy\lib\function_base.py:5499, in append(arr, values, axis)
   5497     values = ravel(values)
   5498     axis = arr.ndim-1
-> 5499 return concatenate((arr, values), axis=axis)

File <__array_function__ internals>:200, in concatenate(*args, **kwargs)

ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 2 and the array at index 1 has size 64

After a few hours of bug fixing in the code, since it had semantic and syntax errors. We are now at a point, were the networks starts training, but we still run into an additional Error in the Node implementation. Since fixing the semantic error of the Node class and potential further errors and a seperate implementation of the sampling method would be over the scope of the first assignment we decided to skip the density forest implementation and will continue with the single gaussian.

Single Gaussian¶

Now we implement a single Gaussian model, by using the 'GaussianMixture' model from scikit-learn with just one component

In [ ]:
n_components = 1
single_gaussian = GaussianMixture(n_components=n_components)
single_gaussian.fit(X_train)

new_samples, _ = single_gaussian.sample(len(X_test))
result_rbf = MMD(X_test,new_samples, "rbf")
result_inv = MMD(X_test,new_samples, "inverse-multi-quadratic")

print(f"MMD with squared exponential kernel: {result_rbf.item()}")
print(f"MMD with inverse multi quadratic kernel: {result_inv.item()}")

new_samples, _ = single_gaussian.sample(10)
fig, axes = plt.subplots(2, 5, figsize=(10, 5))
for i, ax in enumerate(axes.ravel()):
    ax.imshow(new_samples[i].reshape(8, 8), cmap=plt.cm.gray)
    ax.set_title(f"Sample {i}")
plt.show()

We can see that the single Gaussian is already capable of generating number like images, with some samples indicate legible numbers with higher noise. However, it clearly still lacks the capability to generate decent quality images in the majority of samples. Next, we will analyze the quality of generated dependend on the dataset size

In [ ]:
# Create a list of dataset sizes you want to use
dataset_sizes = [0.1, 0.3, 0.5, 0.8, 1.0]  # Fraction of the original dataset size
rbf_gaussian = []
inv_gaussian = []
var_data_gaussian = GaussianMixture(n_components=1)

for size in dataset_sizes:
    # Split the dataset into a training sets with the desired size
    if size == 1.0:
        X, _, y, _ = train_test_split(X_train, y_train, test_size=None)
    else:
        X, _, y, _ = train_test_split(X_train, y_train, test_size=1 - size)

    # Train the classifier on the current dataset size
    var_data_gaussian.fit(X)
    new_samples, _ = var_data_gaussian.sample(len(X))
    rbf_gaussian.append(MMD(X,new_samples, "rbf"))
    inv_gaussian.append(MMD(X,new_samples, "inverse-multi-quadratic"))
    # Create the plot


plt.figure(figsize=(10, 6))
plt.plot(dataset_sizes, rbf_gaussian, label="RBF MMD")
plt.plot(dataset_sizes, inv_gaussian, label="Inv. Multi-Quadratic MMD")

plt.title("MMD vs. Dataset Size")
plt.xlabel("dataset size")
plt.ylabel("MMD")
plt.legend()
plt.grid(True)

We see, the model improves the more training data it uses

Gaussian mixture model (GMM)¶

Let's continue with training a Gaussian mixture model (GMM) and start by using 50 gaussian components

In [ ]:
n_components = 55
gmm = GaussianMixture(n_components=n_components)
gmm.fit(X_train)
new_samples, _ = gmm.sample(len(X_test))
result_rbf = MMD(X_test,new_samples, "rbf")
result_inv = MMD(X_test,new_samples, "inverse-multi-quadratic")

print(f"MMD with squared exponential kernel: {result_rbf.item()}")
print(f"MMD with inverse multi quadratic kernel: {result_inv.item()}")

# plot generated images
new_samples, _ = gmm.sample(10)
fig, axes = plt.subplots(2, 5, figsize=(10, 5))
for i, ax in enumerate(axes.ravel()):
    ax.imshow(new_samples[i].reshape(8, 8), cmap=plt.cm.gray)
    ax.set_title(f"Sample {i}")
plt.show()

Now we investigate the hyperparameter number of components (n_components) in an grid search attempt to find a good fit

In [ ]:
n_components = list(range(5, 101, 5))
rbf_mmd = []
inv_mmd = []
for _, n in enumerate(n_components):
    n_gmm = GaussianMixture(n_components=n)
    n_gmm.fit(X_train)
    new_samples, _ = n_gmm.sample(len(X_test))
    rbf_mmd.append(MMD(X_test,new_samples, "rbf"))
    inv_mmd.append(MMD(X_test,new_samples, "inverse-multi-quadratic"))

# Create the plot
plt.figure(figsize=(10, 6))
plt.plot(n_components, rbf_mmd, label="RBF MMD")
plt.plot(n_components, inv_mmd, label="Inv. Multi-Quadratic MMD")

plt.title("MMD vs. n_components")
plt.xlabel("n_components")
plt.ylabel("MMD")
plt.legend()
plt.grid(True)

plt.show()

Furthe we test the influence of the dataset size with the optimal n_component size

In [ ]:
# Create a list of dataset sizes you want to use
dataset_sizes = [0.1, 0.3, 0.5, 0.8, 1.0]  # Fraction of the original dataset size
rbf_gmm = []
inv_gmm = []
var_data_gmm = GaussianMixture(n_components=55)

for size in dataset_sizes:
    # Split the dataset into a training sets with the desired size
    if size == 1.0:
        X, _, y, _ = train_test_split(X_train, y_train, test_size=None)
    else:
        X, _, y, _ = train_test_split(X_train, y_train, test_size=1 - size)

    # Train the classifier on the current dataset size
    var_data_gmm.fit(X)
    new_samples, _ = var_data_gmm.sample(len(X))
    rbf_gmm.append(MMD(X,new_samples, "rbf"))
    inv_gmm.append(MMD(X,new_samples, "inverse-multi-quadratic"))
    # Create the plot


plt.figure(figsize=(10, 6))
plt.plot(dataset_sizes, rbf_gmm, label="RBF MMD")
plt.plot(dataset_sizes, inv_gmm, label="Inv. Multi-Quadratic MMD")

plt.title("MMD vs. Dataset Size")
plt.xlabel("dataset size")
plt.ylabel("MMD")
plt.legend()
plt.grid(True)

Also here the model improves with dataset size

Kernel Density Estimator (KDE)¶

Now we implement a kernel density estimator (KDE) with squared exponential kernel

In [ ]:
from sklearn.neighbors import KernelDensity
bandwidth = 1.1
kde = KernelDensity(bandwidth=bandwidth, kernel='gaussian')
kde.fit(X_train)
new_samples = kde.sample(len(X_test))
result_rbf = MMD(X_test,new_samples, "rbf")
result_inv = MMD(X_test,new_samples, "inverse-multi-quadratic")

print(f"MMD with squared exponential kernel: {result_rbf.item()}")
print(f"MMD with inverse multi quadratic kernel: {result_inv.item()}")

new_samples = kde.sample(10)
fig, axes = plt.subplots(2, 5, figsize=(10, 5))
for i, ax in enumerate(axes.ravel()):
    ax.imshow(new_samples[i].reshape(8, 8), cmap=plt.cm.gray)
    ax.set_title(f"Sample {i}")
plt.show()
In [ ]:
bandwidths = np.arange(0.1, 2.6, 0.1)
rbf_mmd = []
inv_mmd = []
for _, bandwidth in enumerate(bandwidths):
    n_kde = KernelDensity(bandwidth=bandwidth, kernel='gaussian')
    n_kde.fit(X_train)
    new_samples = n_kde.sample(len(X_test))
    rbf_mmd.append(MMD(X_test,new_samples, "rbf"))
    inv_mmd.append(MMD(X_test,new_samples, "inverse-multi-quadratic"))

# Create the plot
plt.figure(figsize=(10, 6))
plt.plot(bandwidths, rbf_mmd, label="RBF MMD")
plt.plot(bandwidths, inv_mmd, label="Inv. Multi-Quadratic MMD")

plt.title("MMD vs. bandwidth")
plt.xlabel("bandwith")
plt.ylabel("MMD")
plt.legend()
plt.grid(True)

plt.show()
In [ ]:
# Create a list of dataset sizes you want to use
dataset_sizes = [0.1, 0.3, 0.5, 0.8, 1.0]  # Fraction of the original dataset size
rbf_kde = []
inv_kde = []
var_data_gaussian = KernelDensity(bandwidth=1.1, kernel='gaussian')

for size in dataset_sizes:
    # Split the dataset into a training sets with the desired size
    if size == 1.0:
        X, _, y, _ = train_test_split(X_train, y_train, test_size=None)
    else:
        X, _, y, _ = train_test_split(X_train, y_train, test_size=1 - size)

    # Train the classifier on the current dataset size
    var_data_gaussian.fit(X)
    new_samples = var_data_gaussian.sample(len(X))
    rbf_kde.append(MMD(X,new_samples, "rbf"))
    inv_kde.append(MMD(X,new_samples, "inverse-multi-quadratic"))
    # Create the plot


plt.figure(figsize=(10, 6))
plt.plot(dataset_sizes, rbf_kde, label="RBF MMD")
plt.plot(dataset_sizes, inv_kde, label="Inv. Multi-Quadratic MMD")

plt.title("MMD vs. Dataset Size")
plt.xlabel("dataset size")
plt.ylabel("MMD")
plt.legend()
plt.grid(True)

Random Forest Classifier¶

Finally we implement a Random forest classifier on the original dataset to destinguish the 10 digits. Further we will use this classifier to check if the models are working reasonably and that the 10 digits are generated in equal proportion

In [ ]:
from sklearn.ensemble import RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=100)
rf_classifier.fit(X_train, y_train)
In [ ]:
from sklearn.metrics import accuracy_score

y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on the test data: {accuracy:.2f}")
fig, axes = plt.subplots(2, 5, figsize=(10, 5))
for i, ax in enumerate(axes.ravel()):
    ax.imshow(X_test[i].reshape(8, 8), cmap=plt.cm.gray)
    ax.set_title(f"True: {y_test[i]} Pred: {y_pred[i]}")
plt.show()

y_pred = rf_classifier.predict(X_train)
# Count the frequency of each number from 0 to 9 in y_pred_gauss
count = np.bincount(y_pred)

# Create a bar chart to visualize the frequencies
plt.bar(range(10), count)
plt.xticks(range(10))
plt.xlabel("Predicted Numbers")
plt.ylabel("Frequency")
plt.title("Frequency of Predicted Numbers in the Train Dataset")
plt.show()

We can see, the Classifier does a really good job on classifying the test dataset. Also the frequency of each predicted class in the training dataset is very evanly spaced out. So now, lets test the classifier on generated data samples from each model!

In [ ]:
sample_size = 500
samples_gauss, _ = single_gaussian.sample(sample_size)
samples_gmm, _ = gmm.sample(sample_size)
samples_kde = kde.sample(sample_size)


y_pred_gauss = rf_classifier.predict(samples_gauss)

fig, axes = plt.subplots(2, 5, figsize=(10, 5))
for i, ax in enumerate(axes.ravel()):
    ax.imshow(samples_gauss[i].reshape(8, 8), cmap=plt.cm.gray)
    ax.set_title(f"Pred: {y_pred_gauss[i]}")
fig.suptitle("Predictions for Single Gaussian Samples", fontsize=16)
plt.show()

# Count the frequency of each number from 0 to 9 in y_pred_gauss
count = np.bincount(y_pred_gauss)

# Create a bar chart to visualize the frequencies
plt.bar(range(10), count)
plt.xticks(range(10))
plt.xlabel("Predicted Numbers")
plt.ylabel("Frequency")
plt.title("Frequency of Predicted Numbers")
plt.show()

y_pred_gmm = rf_classifier.predict(samples_gmm)
fig, axes = plt.subplots(2, 5, figsize=(10, 5))
for i, ax in enumerate(axes.ravel()):
    ax.imshow(samples_gmm[i].reshape(8, 8), cmap=plt.cm.gray)
    ax.set_title(f"Pred: {y_pred_gmm[i]}")
fig.suptitle("Predictions for GMM Samples", fontsize=16)
plt.show()

# Count the frequency of each number from 0 to 9 in y_pred_gauss
count = np.bincount(y_pred_gmm)

# Create a bar chart to visualize the frequencies
plt.bar(range(10), count)
plt.xticks(range(10))
plt.xlabel("Predicted Numbers")
plt.ylabel("Frequency")
plt.title("Frequency of Predicted Numbers")
plt.show()

y_pred_kde = rf_classifier.predict(samples_kde)
fig, axes = plt.subplots(2, 5, figsize=(10, 5))
for i, ax in enumerate(axes.ravel()):
    ax.imshow(samples_kde[i].reshape(8, 8), cmap=plt.cm.gray)
    ax.set_title(f"Pred: {y_pred_kde[i]}")
fig.suptitle("Predictions for GMM Samples", fontsize=16)
plt.show()

# Count the frequency of each number from 0 to 9 in y_pred_gauss
count = np.bincount(y_pred_kde)

# Create a bar chart to visualize the frequencies
plt.bar(range(10), count)
plt.xticks(range(10))
plt.xlabel("Predicted Numbers")
plt.ylabel("Frequency")
plt.title("Frequency of Predicted Numbers")
plt.show()

From the plot we can see, that the single Gaussian performs the worst on the training dataset since a disproportinal amount of the generated images get classified as the digit 8. The GMM and the KDE both have substentially better results, where the generated images are more evanly classified throughout the classes, suggesting better generated data samples

In [ ]: